# merge lad bootstraps
library(haven)

n_bootstrap=1000

ladout=lapply(1982:2019, function(yy){
  
  print(yy)
  bootstrapout=lapply(1:n_bootstrap, function(ii){
    
    filename_ind=paste("H:/Zheng_10223/Joint/LAD/bootstrap/ind/pctind_sample",ii,"_year",yy,".dta", sep="")
    
    filename_parents=paste("H:/Zheng_10223/Joint/LAD/bootstrap/parents/pctparent_sample",ii,"_year",yy,".dta", sep="")
    
    dfind=read_dta(filename_ind)
    dfparents=read_dta(filename_parents)
    
    # merge individual and parents 
    dfmerge=merge(dfind,dfparents,by=c("year","bsamplenum","pct") )
    dfmerge=dfmerge[dfmerge$pct!=100,]
    
    # fix the column names 
    ind30name=paste("pctind_",ii,"age30",sep="")
    ind45name=paste("pctind_",ii,"age45",sep="")
    
    
    
    dfmerge$pctindage30=dfmerge[[ind30name]]
    dfmerge$pctindage45=dfmerge[[ind45name]]
    
    
    dfmerge=dfmerge[,c("year","bsamplenum","pct","pctindage30","pctindage45","pctparentage30","pctparentage45")]
    return(dfmerge)
  })
  
  
  allboots=do.call(rbind,bootstrapout)
  
  
  
  
})

alllad=do.call(rbind,ladout)

write.csv(alllad, "H:/Zheng_10223/Joint/LAD/bootstrap/all_lad_bootstrap.csv")


############## RUN FROM HERE ######################## 

library(data.table)
library(haven)
sample=fread("H:/Zheng_10223/Joint/samplefile_bootstrap.csv")


cpi <- read_dta("H:/Zheng_10223/Joint/cpi/cpi.dta")



# Read in all the Lad_bootstrap

# Income percentiles from the LAD (at year-age-level)
income_pct <- read.csv("H:/Zheng_10223/Joint/LAD/bootstrap/all_lad_bootstrap.csv", header=TRUE, stringsAsFactors = FALSE)
income_pct <- merge(x=income_pct, y=cpi, by.x="year", by.y="year", all.x=T)
income_pct$pctind_age30_TIRC_real <- income_pct$pctindage30*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctind_age45_TIRC_real <- income_pct$pctindage45*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi




income_pct$pctparent_age45_TIRC_real <- income_pct$pctparentage45*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct <- data.table(income_pct)
income_pct <- income_pct[order(bsamplenum,year, pct)]



resampled_data <- lapply(1:1000, function(k) {
  set.seed(k) # set the seed
  
  
  resample <- sample[sample(nrow(sample), size = nrow(sample), replace = TRUE), ]
  # Get child income percentiles (using Canadian 30year old)
  
  
  # LAD INCOME_PCT ONLY STARTS IN 1982 
  for(i in 1982:2019){
    
    # Compare income with rank/percentile of average household income AMONG PARENTS
    # calculate percentiles 
    resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)] <- findInterval(resample$Child_Income_IND_30_34[which(resample$YearAt30==i)], vec=income_pct$pctind_age30_TIRC_real[which(income_pct$year==i & income_pct$bsamplenum==k)])
    resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i & resample$Child_Income_IND_30_34_pct==min(resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)], na.rm=T))] <-0.5*(1+resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i & resample$Child_Income_IND_30_34_pct==min(resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)], na.rm=T))])
    
    
    resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i)] <- findInterval(resample$MainParent_Income_HH_MainParentAge45_49[which(resample$MainParentYearat45==i)], vec=income_pct$pctparent_age45_TIRC_real[which(income_pct$year==i & income_pct$bsamplenum==k)])
    resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i  & resample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i )], na.rm=T))] <-0.5*(1+resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i & resample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i )], na.rm=T))])
    
    
    print(i)
  }
  
  # run the three regressions of interest
  model_all= lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample )
  model_refugee=lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample[resample$Status_Refugee==TRUE,] )
  model_nonrefugee=lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample[resample$Status_Refugee==FALSE,] )
  
  # Store the coefficients 
  all_coef <- as.data.frame( coef(summary(model_all)) )
  all_refugee <- as.data.frame( coef(summary(model_refugee)) )
  all_nonrefugee   <- as.data.frame( coef(summary(model_nonrefugee)) )
  
  all_coef$bootiter=k; all_coef$EstimateType[1]="Intercept"; all_coef$EstimateType[2]="Slope"
  all_refugee$bootiter=k; all_refugee$EstimateType[1]="Intercept"; all_refugee$EstimateType[2]="Slope"
  all_nonrefugee$bootiter=k; all_nonrefugee$EstimateType[1]="Intercept"; all_nonrefugee$EstimateType[2]="Slope"
  
  all_coef$model="all"
  all_refugee$model="refugee"
  all_nonrefugee$model="nonrefugee"
  
  dfout=rbind(all_coef,all_refugee)
  dfout=rbind(dfout,all_nonrefugee)
  
  return(dfout)
  
})
coefsout=do.call(rbind, resampled_data)

fwrite(coefsout, "H:/Zheng_10223/Joint/bootstrapcoefs1_1000.csv")


df=read.csv("H:/Zheng_10223/Joint/bootstrapcoefs1_1000.csv",header=TRUE, stringsAsFactors = FALSE)

# All 
mean(df$Estimate[df$model=="all" & df$EstimateType=="Intercept"])
sd(df$Estimate[df$model=="all" & df$EstimateType=="Intercept"])

mean(df$Estimate[df$model=="all" & df$EstimateType=="Slope"])
sd(df$Estimate[df$model=="all" & df$EstimateType=="Slope"])

# Refugee
mean(df$Estimate[df$model=="refugee" & df$EstimateType=="Intercept"])
sd(df$Estimate[df$model=="refugee" & df$EstimateType=="Intercept"])

mean(df$Estimate[df$model=="refugee" & df$EstimateType=="Slope"])
sd(df$Estimate[df$model=="refugee" & df$EstimateType=="Slope"])
# non Refugee
mean(df$Estimate[df$model=="nonrefugee" & df$EstimateType=="Intercept"])
sd(df$Estimate[df$model=="nonrefugee" & df$EstimateType=="Intercept"])
mean(df$Estimate[df$model=="nonrefugee" & df$EstimateType=="Slope"])
sd(df$Estimate[df$model=="nonrefugee" & df$EstimateType=="Slope"])